import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import zscore
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
cData=pd.read_csv('Data-Parkinsons.csv')
cData.head()
print(cData.columns)
cData.isnull().values.any()
cData.info()
cData.describe()
The target variable is 'status'. The 'name' column is just an alphanumeric record identifier, so it is not needed for further analysis.
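A quick numeric check of the class balance (in the UCI Parkinson's data, status = 1 indicates Parkinson's, 0 healthy):
# Proportion of each class in the target column
print(cData['status'].value_counts(normalize=True))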
# Distributions of the acoustic features (sns.histplot replaces the deprecated sns.distplot)
dist_cols = ['MDVP:Fhi(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ',
             'MDVP:Flo(Hz)', 'Jitter:DDP', 'MDVP:Shimmer', 'Shimmer:APQ3', 'Shimmer:APQ5',
             'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2']
for col in dist_cols:
    plt.figure()
    sns.histplot(cData[col], kde=True)
plt.show()
# Class balance of the target
sns.countplot(x='status', data=cData)
A heatmap is a convenient way to inspect the correlation coefficients. We will drop the columns with low correlation to the target: irrelevant features add noise that can reduce an algorithm's accuracy, so keeping only the relevant columns should lead to better results.
# Correlation coefficient of each attribute with the target
correlation_values = cData.corr(numeric_only=True)['status']  # 'name' is non-numeric, so restrict to numeric columns
correlation_values.abs().sort_values(ascending=False)
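As a sketch, the same correlations could drive an automatic filter; the 0.2 cutoff below is an arbitrary illustration, not a tuned value:
# Keep only features whose absolute correlation with 'status' exceeds the cutoff
abs_corr = correlation_values.abs().drop('status')  # exclude the target's self-correlation
threshold = 0.2  # arbitrary illustrative cutoff
selected = abs_corr[abs_corr > threshold].index.tolist()
print(selected)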
corr_map = cData.corr(numeric_only=True)
sns.heatmap(corr_map,square=True,cmap="YlGnBu")
# k is the number of top-correlated features to display in the heatmap
k=10
# Select the columns most correlated with the target, sorted from highest coefficient downwards
cols=corr_map.nlargest(k,'status')['status'].index
# correlation coefficient values
coff_values=np.corrcoef(cData[cols].values.T)
sns.set(font_scale=1.25)
sns.heatmap(coff_values,cbar=True,annot=True,square=True,fmt='.2f',cmap="YlGnBu",
annot_kws={'size': 10},yticklabels=cols.values,xticklabels=cols.values)
plt.show()
# Pairwise distributions of the numeric columns ('name' is non-numeric, so exclude it)
sns.pairplot(cData.drop(columns='name'), diag_kind='kde')
# Drop the identifier column along with features that are redundant or weakly correlated with the target
cData.drop(columns=['name', 'Jitter:DDP', 'DFA', 'NHR', 'MDVP:Fhi(Hz)'], inplace=True)
X = cData.drop('status',axis=1) # Predictor feature columns
Y = cData['status'] # Target class (1 = Parkinson's, 0 = healthy)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
# random_state=1 fixes the seed so the split is reproducible
x_train.head()
# Treat zeros as missing values and replace them with the column mean
rep_0 = SimpleImputer(missing_values=0, strategy="mean")
cols = x_train.columns
x_train = pd.DataFrame(rep_0.fit_transform(x_train))
x_test = pd.DataFrame(rep_0.transform(x_test))  # transform only: fit on the training data to avoid leakage
x_train.columns = cols
x_test.columns = cols
x_train.head()
model = LogisticRegression(solver="liblinear")
model.fit(x_train, y_train)
y_predict = model.predict(x_test)
coef_df = pd.DataFrame(model.coef_)
coef_df['intercept'] = model.intercept_
print(coef_df)
model_score = model.score(x_test, y_test)
print(model_score)
cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
df_cm = pd.DataFrame(cm, index=["Actual 1", "Actual 0"],
                     columns=["Predict 1", "Predict 0"])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test,y_predict, labels=[1, 0]))
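Because the classes are imbalanced, accuracy alone can be optimistic; a minimal sketch that also reports ROC-AUC for the logistic model:
from sklearn.metrics import roc_auc_score
# Use predicted probabilities of the positive class for ROC-AUC
y_prob = model.predict_proba(x_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_prob))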
cData.groupby(["status"]).count()
XScaled = X.apply(zscore) # convert all attributes to Z scale
XScaled.describe()
X_train, X_test, Y_train, Y_test = train_test_split(XScaled, Y, test_size=0.30, random_state=42)
NNH = KNeighborsClassifier(n_neighbors=5, weights='distance')
NNH.fit(X_train, Y_train)
predicted_labels = NNH.predict(X_test)
NNH.score(X_test, Y_test)
knn_cm = confusion_matrix(Y_test, predicted_labels, labels=[1, 0])
print("Confusion matrix of the KNN classifier =\n", knn_cm)
knn_m = pd.DataFrame(knn_cm, index=["Actual 1", "Actual 0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize = (7,5))
sns.heatmap(knn_m, annot=True)
print("Classification Report")
print(metrics.classification_report(Y_test, predicted_labels, labels=[1, 0]))
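n_neighbors=5 was chosen by hand; a minimal sketch scanning odd k values with cross-validation on the scaled training data (the range is an arbitrary choice):
from sklearn.model_selection import cross_val_score
# Evaluate odd k values and report the mean CV accuracy for each
for k in range(1, 16, 2):
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
    print(k, cross_val_score(knn, X_train, Y_train, cv=5, scoring='accuracy').mean())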
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size = 0.3, random_state = 10)
svc_model = SVC(C=0.1, kernel='linear', gamma=1)
svc_model.fit(x_train, y_train)
prediction = svc_model.predict(x_test)
print(svc_model.score(x_train, y_train))
print(svc_model.score(x_test, y_test))
svm_cm = metrics.confusion_matrix(y_test, prediction, labels=[1, 0])  # y_true first, then predictions
svm_m = pd.DataFrame(svm_cm, index=["Actual 1", "Actual 0"], columns=["Predict 1", "Predict 0"])
plt.figure(figsize=(7, 5))
sns.heatmap(svm_m, annot=True)
print("Classification Report")
print(metrics.classification_report(y_test, prediction, labels=[1, 0]))
svc_model = SVC(kernel='rbf')
svc_model.fit(x_train, y_train)
prediction = svc_model.predict(x_test)
print(svc_model.score(x_train, y_train))
print(svc_model.score(x_test, y_test))
svc_model = SVC(kernel='poly')
svc_model.fit(x_train, y_train)
prediction = svc_model.predict(x_test)
print(svc_model.score(x_train, y_train))
print(svc_model.score(x_test, y_test))
svc_model = SVC(kernel='sigmoid')
svc_model.fit(x_train, y_train)
prediction = svc_model.predict(x_test)
print(svc_model.score(x_train, y_train))
print(svc_model.score(x_test, y_test))
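Rather than fitting each kernel by hand, the hyperparameters could be searched jointly; a minimal sketch with GridSearchCV, where the grid values are illustrative rather than tuned:
from sklearn.model_selection import GridSearchCV
# Small illustrative grid over kernel, C and gamma
param_grid = {'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
              'C': [0.1, 1, 10],
              'gamma': ['scale', 1]}
grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)
print(grid.best_params_, grid.best_score_)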
feature_cols = ['MDVP:Fo(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'MDVP:Shimmer',
                'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA', 'HNR', 'RPDE', 'spread1', 'spread2', 'D2', 'PPE']
clf = DecisionTreeClassifier()
clf=clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
from sklearn.tree import export_graphviz
from io import StringIO  # sklearn.externals.six was removed from newer scikit-learn
from IPython.display import Image
import pydotplus
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
filled=True, rounded=True,
special_characters=True,feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('cd.png')
Image(graph.create_png())
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
filled=True, rounded=True,
special_characters=True, feature_names = feature_cols,class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('cd1.png')
Image(graph.create_png())
from sklearn.ensemble import RandomForestClassifier
randf = RandomForestClassifier(n_estimators = 50)
randf = randf.fit(x_train, y_train)
predrandf= randf.predict(x_test)
accrandf = accuracy_score(y_test, predrandf)
print(accrandf)
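One advantage of the random forest is its built-in impurity-based feature importances; a quick sketch to inspect which features drive the predictions:
# Rank features by the forest's impurity-based importances
importances = pd.Series(randf.feature_importances_, index=x_train.columns)
print(importances.sort_values(ascending=False))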
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(n_estimators=50,learning_rate=1)
model = abc.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
from sklearn.ensemble import BaggingClassifier
bag = BaggingClassifier(n_estimators=50, max_samples= .7, bootstrap=True, oob_score=True, random_state=22)
bag= bag.fit(x_train, y_train)
predBAG =bag.predict(x_test)
accBAG = accuracy_score(y_test, predBAG)
print(accBAG)
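Since the bagger was built with oob_score=True, its out-of-bag estimate provides a validation-style accuracy without touching the test set:
# Out-of-bag accuracy, estimated from the samples left out of each bootstrap
print("OOB accuracy:", bag.oob_score_)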
from sklearn.ensemble import GradientBoostingClassifier
grad = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.1, random_state=22)
grad = grad.fit(x_train, y_train)
predgrad =grad.predict(x_test)
accgrad = accuracy_score(y_test, predgrad)
print(accgrad)
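GradientBoostingClassifier exposes staged predictions, which show how test accuracy evolves as trees are added; a minimal sketch printing every 10th stage:
# Test accuracy after each boosting stage (every 10th stage shown)
for i, stage_pred in enumerate(grad.staged_predict(x_test), start=1):
    if i % 10 == 0:
        print(i, accuracy_score(y_test, stage_pred))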
from sklearn import model_selection
from mlxtend.classifier import StackingClassifier
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = SVC(C=0.1, kernel='linear', gamma=1)
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)
print('3-fold cross validation:\n')
for clf, label in zip([clf1, clf2, clf3, sclf],
                      ['KNN',
                       'Random Forest',
                       'SVM',
                       'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X, Y,
                                             cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
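KNN and SVC are sensitive to feature scale, so the stack may benefit from standardized inputs; a hedged sketch wrapping the scale-sensitive base learners in pipelines:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Scale-sensitive base learners get a StandardScaler in front
clf1s = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=1))
clf3s = make_pipeline(StandardScaler(), SVC(C=0.1, kernel='linear', gamma=1))
sclf_scaled = StackingClassifier(classifiers=[clf1s, clf2, clf3s], meta_classifier=lr)
scores = model_selection.cross_val_score(sclf_scaled, X, Y, cv=3, scoring='accuracy')
print("Scaled stack accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))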
# compare standalone models for binary classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot
# get the dataset
def get_dataset():
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=1)
    return X, y
# get a list of models to evaluate
def get_models():
    models = dict()
    models['lr'] = LogisticRegression()
    models['knn'] = KNeighborsClassifier()
    models['cart'] = DecisionTreeClassifier()
    models['svm'] = SVC()
    models['bayes'] = GaussianNB()
    return models
# evaluate a given model using cross-validation
def evaluate_model(model):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores
# define dataset
X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
# get a list of ensemble models to evaluate
def get_models():
    models = dict()
    models['ab'] = AdaBoostClassifier()
    models['bc'] = BaggingClassifier()
    models['gb'] = GradientBoostingClassifier()
    return models
# define dataset
X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
After evaluating several models on this dataset, I conclude that the Gradient Boosting Classifier is the most accurate, with a test accuracy of roughly 0.949.